In [1]:
import numpy as np #linear algebra
import pandas as pd #data manipulation and analysis
import matplotlib.pyplot as plt #data visualization
import seaborn as sns #data visualization
from scipy.spatial import ConvexHull, convex_hull_plot_2d
import matplotlib.path as mpath
import plotly.express as px
import plotly.graph_objects as go
import textwrap

Explain what will be happening here¶

In [2]:
# read-in dataframe with cluster ids
cluster2_df = pd.read_csv("laborEduClusterData.csv", index_col=0)
cluster2_df.head()
Out[2]:
Bachelor's degree or higher of persons age 25 years+, 2017-2021 With a disability, under age 65 years, 2017-2021 Persons without health insurance, under age 65 years In civilian labor force, total of population age 16 years+, 2017-2021 In civilian labor force, female of population age 16 years+, 2017-2021 Total retail sales per capita, 2017 Mean travel time to work (minutes), workers age 16 years+, 2017-2021 Median household income (in 2021 dollars), 2017-2021 Per capita income in past 12 months (in 2021 dollars), 2017-2021 Persons in poverty Total employer establishments, 2020 Total employment, 2020 Total annual payroll, 2020 ($1,000) Population per square mile, 2020 Land area in square miles, 2020 cluster_id Banned or not County Name
0 4406.72 4434.26 2588.95 14184.13 14018.88 9461 36.1 44467 24539 5205.44 385 4572 167427 47.1 583.87 0 0.0 Adams County, Ohio
1 19418.97 12912.09 8540.28 61917.03 58765.26 16266 19.5 55114 28671 15555.51 2286 45012 1983272 253.9 402.55 0 1.0 Allen County, Ohio
2 11614.15 5179.28 4708.44 32592.87 29663.17 9431 24.2 58168 28992 5702.44 1034 18234 701075 124.0 422.99 0 0.0 Ashland County, Ohio
3 14697.89 12361.80 9733.70 55190.08 51296.60 10406 25.8 49680 26777 15281.91 1806 24464 913850 139.0 702.07 0 0.0 Ashtabula County, Ohio
4 21161.10 8005.22 5833.26 34316.97 33386.13 11148 21.9 47061 24990 12969.70 1029 13265 469095 124.0 503.64 0 0.0 Athens County, Ohio

All counties in each cluster¶

In [3]:
from functionsAll import split_dataframe_by_cluster, get_cluster_coords_dict,coords, clusterk_dict,\
cluster0, cluster1, cluster2
In [4]:
# 1
df_list = split_dataframe_by_cluster(cluster2_df, 'cluster_id')

# 2
coords_list = coords(df_list, cluster2_df)

# 3
clusterK_dict = clusterk_dict(df_list, coords_list)
In [5]:
for i in range(len(df_list)):
    if len(df_list) ==1:
        cluster0_dict = cluster0(clusterK_dict)
    elif len(df_list) ==2:
        cluster0_dict = cluster0(clusterK_dict)
        cluster1_dict = cluster1(clusterK_dict)
    elif len(df_list) ==3:
        cluster0_dict = cluster0(clusterK_dict)
        cluster1_dict = cluster1(clusterK_dict)
        cluster2_dict = cluster2(clusterK_dict)
        
    print(f"cluster{i}_dict have been made to dictionary")
        
        
cluster0_dict have been made to dictionary
cluster1_dict have been made to dictionary
cluster2_dict have been made to dictionary

Banned counties in each cluster¶

In [6]:
from functionsBanned import filter_banned_counties, get_banned_cluster_coords_dict, bannedCoords, clusterk_dict_banned,\
cluster0Banned, cluster1Banned, cluster2Banned
In [9]:
# 1
banned_counties_df = filter_banned_counties(cluster2_df)

# 2
bannedCoords_list = bannedCoords(banned_counties_df, cluster2_df)

# 3
clusterKBanned_dict = clusterk_dict_banned(banned_counties_df, bannedCoords_list)
Cluster0 had enough banned counties to find non-banned counties in the banned counties convex hull.
Cluster2 had enough banned counties to find non-banned counties in the banned counties convex hull.

Now to look at the counties inside the convex hull of the banned counties¶

In [10]:
from countyName import banned_counties_list, countyNames_cluster0, countyNames_cluster1, countyNames_cluster2, merge_dicts
In [11]:
bannedCountiesList=banned_counties_list(cluster2_df)


# Make into function
for i in range(len(banned_counties_df)):
    if len(banned_counties_df) ==1:
        cluster0_banned_dict = cluster0Banned(clusterKBanned_dict)
        allCounties=countyNames_cluster0(cluster2_df, cluster0_dict, cluster0_banned_dict, bannedCountiesList)

    elif len(banned_counties_df) ==2:
        cluster0_banned_dict = cluster0Banned(clusterKBanned_dict)
        cluster1_banned_dict = cluster1Banned(clusterKBanned_dict)
        
        countyName0=countyNames_cluster0(cluster2_df, cluster0_dict, cluster0_banned_dict, bannedCountiesList)
        countyName1=countyNames_cluster1(cluster2_df, cluster1_dict, cluster1_banned_dict, bannedCountiesList)
        
        allCounties = merge_dicts(countyName0, countyName1)

    elif len(banned_counties_df) ==3:
        cluster0_banned_dict = cluster0Banned(clusterKBanned_dict)
        cluster1_banned_dict = cluster1Banned(clusterKBanned_dict)
        cluster2_banned_dict = cluster2Banned(clusterKBanned_dict)
        
        countyName0=countyNames_cluster0(cluster2_df, cluster0_dict, cluster0_banned_dict, bannedCountiesList)
        countyName1=countyNames_cluster1(cluster2_df, cluster1_dict, cluster1_banned_dict, bannedCountiesList)
        countyName2=countyNames_cluster0(cluster2_df, cluster2_dict, cluster2_banned_dict, bannedCountiesList)
        
    print(f"cluster{i}_banned_dict have been made to dictionary")
        
cluster0_banned_dict have been made to dictionary
cluster1_banned_dict have been made to dictionary

Anything above here is good

Plotting Column Pairs¶

In [12]:
from topVariableFunctions import filtered_var_pairs, categoryCountyList, subcategoryCountyList, commonKeys, freq_var
In [13]:
filt = filtered_var_pairs(allCounties)
In [14]:
categorycountyList = categoryCountyList(filt)
The category with the most counties is 'Mean travel time to work (minutes), workers age 16 years+, 2017-2021' with 26 counties.
In [15]:
subcategorycountyList = subcategoryCountyList(filt)
The subcategory with the most counties is 'Median household income (in 2021 dollars), 2017-2021' with 26 counties.
In [16]:
combinedVars = commonKeys(categorycountyList, subcategorycountyList)

After all variables are combined, we can now take a look at some of the counties that may be at risk the most common variable amoung them

Table of Demographic varibales and the counties that are in it the most¶

In [17]:
sorted_var_dict = dict(sorted(combinedVars.items(), key=lambda x: len(x[1]), reverse=True))
In [18]:
# Make ranking table of vars.
ranked_listTbl = [(i+1, item[0], len(item[1]), ", ".join([textwrap.fill(', '.join(textwrap.wrap(c, width=10)), width=80) for c in item[1]])) for i, item in enumerate(sorted_var_dict.items())]
topVarTbl = pd.DataFrame(ranked_listTbl)
topVarTbl.columns=['Rank', 'Demographic_variable', 'Number_of_counties', 'List_of_Counties']
In [19]:
# try to chnage lengh of cloumn
headerColor = 'grey'
rowEvenColor = 'lightgrey'
rowOddColor = 'white'


fig = go.Figure(data=[go.Table(
    columnwidth = [50,110,75,700],

    header=dict(
        values=['Rank', 'Demographic Variable', 'Number of Counties', 'List of Counties'],
                    line_color='darkslategray',
                    fill_color=headerColor,
                    align=['left','center'],
                    font=dict(color='white', size=12)
    ),
    cells=dict(
        values=[topVarTbl.Rank, topVarTbl.Demographic_variable, topVarTbl.Number_of_counties, topVarTbl.List_of_Counties],
               line_color='darkslategray',
    # 2-D list of colors for alternating rows
    fill_color = [[rowOddColor,rowEvenColor,rowOddColor, rowEvenColor,rowOddColor]*5],
    align = ['center', 'center', 'center', 'left'],
    font = dict(color = 'darkslategray', size = 11)
    ))
])

fig.show()

Plotting counties frequcy in each variable¶

Checking to see which counties had the most demographic variables. Then showing a table of what they are

In [20]:
# # Make ranking table of vars.
ranked_list = [(i+1, item[0], len(item[1]), item[1]) for i, item in enumerate(sorted_var_dict.items())]
newdf2 = pd.DataFrame(ranked_list)
newdf2.columns=['Rank', 'Demographic_variable', 'Number_of_counties', 'List_of_Counties']

Transforming newdf2 to become tidy:¶

In [21]:
# Separate the list of counties into individual rows
newdf2 = newdf2.explode('List_of_Counties')

# Drop the duplicate columns
newdf2 = newdf2.drop_duplicates(subset=['Rank', 'List_of_Counties'])

# Rename the columns for clarity
newdf3 = newdf2.rename(columns={
    'Demographic_variable': 'Demographic_Variable',
    'Number_of_counties': 'Number_of_Counties',
    'List_of_Counties': 'County'
})

# Display the updated data frame
newdf3.head()
Out[21]:
Rank Demographic_Variable Number_of_Counties County
0 1 Median household income (in 2021 dollars), 201... 34 Mercer
0 1 Median household income (in 2021 dollars), 201... 34 Morrow
0 1 Median household income (in 2021 dollars), 201... 34 Jefferson
0 1 Median household income (in 2021 dollars), 201... 34 Tuscarawas
0 1 Median household income (in 2021 dollars), 201... 34 Ashland
In [22]:
# Melting OG data:
melted_df = pd.melt(cluster2_df, id_vars=['County Name'], var_name='Attribute', value_name='Value')
melted_df['County Name'] = melted_df['County Name'].apply(lambda x: x[:-13])
melted_df
Out[22]:
County Name Attribute Value
0 Adams Bachelor's degree or higher of persons age 25 ... 4406.72
1 Allen Bachelor's degree or higher of persons age 25 ... 19418.97
2 Ashland Bachelor's degree or higher of persons age 25 ... 11614.15
3 Ashtabula Bachelor's degree or higher of persons age 25 ... 14697.89
4 Athens Bachelor's degree or higher of persons age 25 ... 21161.10
... ... ... ...
1491 Wood Banned or not 0.00
1492 Union Banned or not 1.00
1493 Wyandot Banned or not 0.00
1494 Summit Banned or not 0.00
1495 Van Wert Banned or not 0.00

1496 rows × 3 columns

In [23]:
tidy_df = pd.merge(newdf3, melted_df[['County Name', 'Attribute', 'Value']], left_on=['County','Demographic_Variable'],
                   right_on=['County Name', 'Attribute'])
tidy_df.drop(['Attribute','County Name'], axis=1, inplace=True)
tidy_df.head()
Out[23]:
Rank Demographic_Variable Number_of_Counties County Value
0 1 Median household income (in 2021 dollars), 201... 34 Mercer 68692.0
1 1 Median household income (in 2021 dollars), 201... 34 Morrow 63411.0
2 1 Median household income (in 2021 dollars), 201... 34 Jefferson 49211.0
3 1 Median household income (in 2021 dollars), 201... 34 Tuscarawas 57545.0
4 1 Median household income (in 2021 dollars), 201... 34 Ashland 58168.0
In [24]:
# create bar chart trace
fig = px.bar(tidy_df, x=tidy_df.Demographic_Variable, y=tidy_df.Number_of_Counties, text='Value', color='County',
             labels={"Attribute": "Attribute",
                     "Value": "Attribute Value"
                 },
             title="Default behavior: some text is tiny",)

fig.update_traces(textposition='inside')
#  update the layout to adjust the size of the plot
fig.update_layout(
    width=1200,  # set the width of the plot to 800 pixels
    height=1400,  # set the height of the plot to 600 pixels
)

# display the plot
fig.show()

Plotting counties frequcy in each variable¶

In [25]:
freq_var =freq_var(ranked_list)
In [26]:
# var_freq to dictionary
var_freq_dict = {item[0]: item[1] for i, item in enumerate(freq_var)}
In [27]:
# sorting var_freq_dict dictionary
sorted_var_freq_dict = dict(sorted(var_freq_dict.items(), key=lambda x: len(x[1]), reverse=True))
In [28]:
# Make ranking table of vars.
ranked_var_freqViz = [(i+1, item[0], len(item[1]), ", ".join([textwrap.fill(' '.join(textwrap.wrap(c, width=10)), width=80) for c in item[1]])) for i, item in enumerate(sorted_var_freq_dict.items())]
top_var_freqdfViz = pd.DataFrame(ranked_var_freqViz)
top_var_freqdfViz.columns=['Rank', 'County', 'Number_of_variables', 'List_of_variables']
In [29]:
headerColor = 'grey'
rowEvenColor = 'lightgrey'
rowOddColor = 'white'


fig = go.Figure(data=[go.Table(
    columnwidth = [40,70,100,600],

    header=dict(
        values=['Rank', 'County Name', 'Number of Demographic Variables', 'List of Demographics'],
                    line_color='darkslategray',
                    fill_color=headerColor,
                    align=['center','center', 'center', 'left'],
                    font=dict(color='white', size=12)
    ),
    cells=dict(
        values=[top_var_freqdfViz.Rank, top_var_freqdfViz.County, top_var_freqdfViz.Number_of_variables,
                top_var_freqdfViz.List_of_variables],
               line_color='darkslategray',
    # 2-D list of colors for alternating rows
    fill_color = [[rowOddColor,rowEvenColor,rowOddColor, rowEvenColor,rowOddColor]*5],
    align = ['center', 'center', 'center', 'left'],
    font = dict(color = 'darkslategray', size = 11)
    ))
])

fig.show()

Tidying for plot¶

In [30]:
# # Make ranking table of vars.
ranked_vars_list = [(i+1, item[0], len(item[1]), item[1]) for i, item in enumerate(sorted_var_freq_dict.items())]
top_var_freqdf = pd.DataFrame(ranked_vars_list)
top_var_freqdf.columns=['Rank', 'County', 'Number_of_variables', 'List_of_variables']
In [31]:
# Separate the list of counties into individual rows
test2 = top_var_freqdf.explode('List_of_variables')

# Drop the duplicate columns
test2 = test2.drop_duplicates(subset=['Rank', 'List_of_variables'])

# Rename the columns for clarity
test5 = test2.rename(columns={
    'List_of_variables': 'Demographic_Variable',
    'Number_of_variables': 'Number_of_variables',
    'County': 'County'
})

# Display the updated data frame
# test5.head()
In [32]:
tidy_df2 = pd.merge(test5, melted_df[['County Name', 'Attribute', 'Value']], left_on=['County','Demographic_Variable'],
                   right_on=['County Name', 'Attribute'])
In [33]:
# create bar chart trace
fig = px.bar(tidy_df2, x=tidy_df2.County, color='Demographic_Variable',text='Value',
             labels={"Attribute": "Attribute",
                     "Value": "Attribute Value",
                     "Demographic_Variable":"Demographic Var."
                 },
             title="Default behavior: some text is tiny",)

fig.update_traces(textposition='inside')
# update the layout to adjust the size of the plot
fig.update_layout(
    width=1500,  # set the width of the plot to 800 pixels
    height=500,  # set the height of the plot to 600 pixels
)

# display the plot
fig.show()

Most frequent Demographic variables¶

In [34]:
## Most common variables
res = sum(sorted_var_freq_dict.values(), [])
mostCommonVar = list(set(res))
mostCommonVar
Out[34]:
['Per capita income in past 12 months (in 2021 dollars), 2017-2021',
 'In civilian labor force, female of population age 16 years+, 2017-2021',
 'Total employment, 2020',
 'Population per square mile, 2020',
 'Total annual payroll, 2020 ($1,000)',
 'Persons in poverty',
 'Median household income (in 2021 dollars), 2017-2021',
 'Persons without health insurance, under age 65 years',
 'In civilian labor force, total of population age 16 years+, 2017-2021',
 'Mean travel time to work (minutes), workers age 16 years+, 2017-2021',
 'With a disability, under age 65 years, 2017-2021',
 'Total employer establishments, 2020',
 'Total retail sales per capita, 2017']

Most Frequent Counties¶

In [35]:
mostCommonCounties = top_var_freqdfViz['County'].tolist()
print(mostCommonCounties)
['Ashland', 'Sandusky', 'Huron', 'Fulton', 'Shelby', 'Muskingum', 'Belmont', 'Washington', 'Darke', 'Marion', 'Jefferson', 'Tuscarawas', 'Clinton', 'Guernsey', 'Pickaway', 'Erie', 'Wayne', 'Greene', 'Columbiana', 'Defiance', 'Athens', 'Ross', 'Mercer', 'Lawrence', 'Trumbull', 'Wyandot', 'Van Wert', 'Clark', 'Williams', 'Ashtabula', 'Scioto', 'Wood', 'Warren', 'Licking', 'Ottawa', 'Paulding', 'Preble', 'Henry', 'Richland', 'Champaign', 'Miami', 'Holmes', 'Clermont', 'Delaware', 'Morrow', 'Carroll', 'Hardin', 'Coshocton', 'Brown', 'Lake', 'Portage', 'Fairfield', 'Highland']

Plotting the counties at risk on a map of Ohio¶

In [36]:
from urllib.request import urlopen
import json
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)

import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/minoritymajority.csv',
                   dtype={"fips": str})

# Filter the data frame to include only Ohio
df_ohio = df[df['STNAME'] == 'Ohio']

# Filter the counties GeoJSON file to include only Ohio counties
counties_ohio = {'type': 'FeatureCollection', 'features': []}
for feature in counties['features']:
    if feature['id'][:2] == '39':
        counties_ohio['features'].append(feature)
In [37]:
cluster2_df['TempCounty Name'] = cluster2_df['County Name'].apply(lambda x: x[:-6])
In [38]:
# merge the two data frames based on the 'County Name' and 'CTYNAME' columns
merged_df = pd.merge(cluster2_df, df_ohio[['CTYNAME', 'FIPS']], left_on='TempCounty Name', right_on='CTYNAME')

# drop the duplicate 'CTYNAME' column
merged_df.drop('CTYNAME', axis=1, inplace=True)
merged_df.drop('TempCounty Name', axis=1, inplace=True)
# print the merged data frame
merged_df.head()
Out[38]:
Bachelor's degree or higher of persons age 25 years+, 2017-2021 With a disability, under age 65 years, 2017-2021 Persons without health insurance, under age 65 years In civilian labor force, total of population age 16 years+, 2017-2021 In civilian labor force, female of population age 16 years+, 2017-2021 Total retail sales per capita, 2017 Mean travel time to work (minutes), workers age 16 years+, 2017-2021 Median household income (in 2021 dollars), 2017-2021 Per capita income in past 12 months (in 2021 dollars), 2017-2021 Persons in poverty Total employer establishments, 2020 Total employment, 2020 Total annual payroll, 2020 ($1,000) Population per square mile, 2020 Land area in square miles, 2020 cluster_id Banned or not County Name FIPS
0 4406.72 4434.26 2588.95 14184.13 14018.88 9461 36.1 44467 24539 5205.44 385 4572 167427 47.1 583.87 0 0.0 Adams County, Ohio 39001
1 19418.97 12912.09 8540.28 61917.03 58765.26 16266 19.5 55114 28671 15555.51 2286 45012 1983272 253.9 402.55 0 1.0 Allen County, Ohio 39003
2 11614.15 5179.28 4708.44 32592.87 29663.17 9431 24.2 58168 28992 5702.44 1034 18234 701075 124.0 422.99 0 0.0 Ashland County, Ohio 39005
3 14697.89 12361.80 9733.70 55190.08 51296.60 10406 25.8 49680 26777 15281.91 1806 24464 913850 139.0 702.07 0 0.0 Ashtabula County, Ohio 39007
4 21161.10 8005.22 5833.26 34316.97 33386.13 11148 21.9 47061 24990 12969.70 1029 13265 469095 124.0 503.64 0 0.0 Athens County, Ohio 39009
In [39]:
merged_df['TempCounty Name'] = merged_df['County Name'].apply(lambda x: x[:-13])
In [40]:
# Example list of counties to check
counties_to_check = merged_df['TempCounty Name'].tolist()

# Create a list to hold safe counties
safe_counties = []

# Check each county and append to the safe_counties list if not in either of the two lists
for county in counties_to_check:
    if county not in mostCommonCounties and county not in bannedCountiesList:
        safe_counties.append(county)

# Print the list of safe counties
print(safe_counties)
['Adams', 'Cuyahoga', 'Fayette', 'Franklin', 'Gallia', 'Geauga', 'Hamilton', 'Harrison', 'Hocking', 'Jackson', 'Lorain', 'Lucas', 'Madison', 'Mahoning', 'Meigs', 'Monroe', 'Montgomery', 'Morgan', 'Noble', 'Perry', 'Pike', 'Putnam', 'Stark', 'Vinton', 'Summit']
In [41]:
merged_df['risk'] = merged_df['TempCounty Name'].apply(lambda x: 'At risk' if x in mostCommonCounties else
                                                      'Already has Banned' if x in bannedCountiesList else
                                                      'Safe' if x in safe_counties else merged_df[merged_df['TempCounty Name'] == x]['risk'].values[0])
In [42]:
merged_df.head()
Out[42]:
Bachelor's degree or higher of persons age 25 years+, 2017-2021 With a disability, under age 65 years, 2017-2021 Persons without health insurance, under age 65 years In civilian labor force, total of population age 16 years+, 2017-2021 In civilian labor force, female of population age 16 years+, 2017-2021 Total retail sales per capita, 2017 Mean travel time to work (minutes), workers age 16 years+, 2017-2021 Median household income (in 2021 dollars), 2017-2021 Per capita income in past 12 months (in 2021 dollars), 2017-2021 Persons in poverty ... Total employment, 2020 Total annual payroll, 2020 ($1,000) Population per square mile, 2020 Land area in square miles, 2020 cluster_id Banned or not County Name FIPS TempCounty Name risk
0 4406.72 4434.26 2588.95 14184.13 14018.88 9461 36.1 44467 24539 5205.44 ... 4572 167427 47.1 583.87 0 0.0 Adams County, Ohio 39001 Adams Safe
1 19418.97 12912.09 8540.28 61917.03 58765.26 16266 19.5 55114 28671 15555.51 ... 45012 1983272 253.9 402.55 0 1.0 Allen County, Ohio 39003 Allen Already has Banned
2 11614.15 5179.28 4708.44 32592.87 29663.17 9431 24.2 58168 28992 5702.44 ... 18234 701075 124.0 422.99 0 0.0 Ashland County, Ohio 39005 Ashland At risk
3 14697.89 12361.80 9733.70 55190.08 51296.60 10406 25.8 49680 26777 15281.91 ... 24464 913850 139.0 702.07 0 0.0 Ashtabula County, Ohio 39007 Ashtabula At risk
4 21161.10 8005.22 5833.26 34316.97 33386.13 11148 21.9 47061 24990 12969.70 ... 13265 469095 124.0 503.64 0 0.0 Athens County, Ohio 39009 Athens At risk

5 rows × 21 columns

In [43]:
# One layer
import plotly.express as px
fig = px.choropleth(merged_df, geojson=counties_ohio, locations='FIPS', color='risk',
                    color_continuous_scale="Viridis",
                    range_color=(0, 2),
                    scope="usa",
                    hover_data=["County Name","Bachelor's degree or higher of persons age 25 years+, 2017-2021",
 'With a disability, under age 65 years, 2017-2021'],
                    labels={'risk':'Risk Level'})
                          
fig.update_geos(fitbounds="locations", visible=False)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()
In [ ]:
 
In [ ]: